{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3XwVnvz2fJ7i"
   },
   "source": [
    "# Parsing, Text Chunking and Indexing (Ex. 1): Fast Start to RAG\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BfZOb3iPduL4",
    "outputId": "e49fb246-b37e-47ab-de50-45e1ed9a3cac"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting grpcio<=1.60.0,>=1.49.1\n",
      "  Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: grpcio\n",
      "Successfully installed grpcio-1.60.0\n"
     ]
    }
   ],
   "source": [
    "# Install grpcio within a pinned range first, just to be safe across environments, then llmware.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install \"grpcio<=1.60.0,>=1.49.1\" --no-cache-dir\n",
    "%pip install -q llmware\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "c-irvLXJerCn"
   },
   "source": [
    "## If deploying locally, use these database options\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "8d4UxQgAe1Ve"
   },
   "outputs": [],
   "source": [
    "from llmware.configs import LLMWareConfig\n",
    "# Local-deployment choices: \"sqlite\" as the active database and \"faiss\" as the vector database.\n",
    "LLMWareConfig().set_active_db(\"sqlite\")\n",
    "LLMWareConfig().set_vector_db(\"faiss\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "-AigwWXpfq6i"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from llmware.library import Library ### Import for library creating\n",
    "from llmware.retrieval import Query ### Import for querying\n",
    "from llmware.setup import Setup ### Import for setup\n",
    "from llmware.configs import LLMWareConfig ### Import for configs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "6YYfDA6Wihic",
    "outputId": "32d302fa-bdb6-472b-fedd-51c3a9abe521"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Step 1 - creating library example1_library\n"
     ]
    }
   ],
   "source": [
    "# Name under which the new llmware library is created\n",
    "library_name = \"example1_library\"\n",
    "print(f\"\\nStep 1 - creating library {library_name}\")\n",
    "\n",
    "# Create the library; the returned `library` object is reused by the later cells\n",
    "# (add_files, get_library_card, library_main_path, Query).\n",
    "library = Library().create_new_library(library_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "WX_SA0VdktcY"
   },
   "outputs": [],
   "source": [
    "sample_folders = [\"Agreements\", \"Invoices\", \"UN-Resolutions-500\", \"SmallLibrary\", \"FinDocs\", \"AgreementsLarge\"]\n",
    "selected_folder=sample_folders[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 36
    },
    "id": "7TPf54ecmhLx",
    "outputId": "30608dca-a616-43e6-94af-60e39f4d754a"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "'Agreements'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected_folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Xa5PzIxBkciv",
    "outputId": "04357372-f0b2-4e28-9812-7087e8cd6c52"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 2 - loading the llmware sample files and saving at: /root/llmware_data/sample_files\n"
     ]
    }
   ],
   "source": [
    "# Fetch the llmware sample files; the call returns the folder they are saved in.\n",
    "# NOTE(review): over_write=False presumably keeps an existing local copy -- confirm against llmware docs.\n",
    "sample_files_path = Setup().load_sample_files(over_write=False)\n",
    "print(f\"Step 2 - loading the llmware sample files and saving at: {sample_files_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CqZC-z1aiNET",
    "outputId": "04f7a385-81ae-4cdd-ed95-d87588baae93"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 3 - parsing and indexing files from /root/llmware_data/sample_files/Agreements\n"
     ]
    }
   ],
   "source": [
    "# Use the folder chosen in `selected_folder` rather than re-hardcoding its name,\n",
    "# so changing the selection in one place changes what gets parsed.\n",
    "sample_folder = selected_folder\n",
    "# Sample folders live directly under the sample-files root returned by Setup().load_sample_files().\n",
    "ingestion_folder_path = os.path.join(sample_files_path, sample_folder)\n",
    "print(f\"Step 3 - parsing and indexing files from {ingestion_folder_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7DCg64fGi8ix",
    "outputId": "86f74032-eee2-4c6c-cadd-c256d2d6abbf"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 4 - completed parsing - {'docs_added': 15, 'blocks_added': 1653, 'images_added': 0, 'pages_added': 204, 'tables_added': 0, 'rejected_files': []}\n"
     ]
    }
   ],
   "source": [
    "# Add the files in the ingestion folder to the library. The returned dict summarizes the run\n",
    "# (docs_added, blocks_added, images_added, pages_added, tables_added, rejected_files).\n",
    "parsing_output = library.add_files(ingestion_folder_path)\n",
    "print(f\"Step 4 - completed parsing - {parsing_output}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "toMWVXDtjJ2k",
    "outputId": "5128b8d3-6653-4567-f93c-6b6012649a5a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 5 - updated library card - documents - 15 - blocks - 1653 - {'_id': 1, 'library_name': 'example1_library', 'embedding': [{'embedding_status': 'no', 'embedding_model': 'none', 'embedding_db': 'none', 'embedded_blocks': 0, 'embedding_dims': 0, 'time_stamp': 'NA'}], 'knowledge_graph': 'no', 'unique_doc_id': 15, 'documents': 15, 'blocks': 1653, 'images': 0, 'pages': 204, 'tables': 0, 'account_name': 'llmware'}\n"
     ]
    }
   ],
   "source": [
    "# The library card is a dict of library-level metadata; read the document and block counts from it.\n",
    "updated_library_card = library.get_library_card()\n",
    "doc_count = updated_library_card[\"documents\"]\n",
    "block_count = updated_library_card[\"blocks\"]\n",
    "print(f\"Step 5 - updated library card - documents - {doc_count} - blocks - {block_count} - {updated_library_card}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "qm5nBqkEhhmx",
    "outputId": "2d991e1f-d74e-4cf9-f2a4-820de48ed337"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 6 - library artifacts - including extracted images - saved at folder path - /root/llmware_data/accounts/llmware/example1_library\n"
     ]
    }
   ],
   "source": [
    "# Folder on disk where this library's artifacts are saved.\n",
    "library_path = library.library_main_path\n",
    "print(f\"Step 6 - library artifacts - including extracted images - saved at folder path - {library_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gC42gipSj0z2",
    "outputId": "d5bdd2ed-33e7-4c55-96fc-18f7b714b169"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Step 7 - running a test query - base salary\n",
      "\n",
      "query results:  0 {'query': 'base salary', '_id': '25', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 1, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Eileithyia EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  1 {'query': 'base salary', '_id': '250', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 3, 'block_ID': 26, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Bia EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1969, 'coords_cx': 2, 'coords_cy': 139, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  2 {'query': 'base salary', '_id': '362', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 4, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Leto EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  3 {'query': 'base salary', '_id': '469', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 5, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Persephone EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  4 {'query': 'base salary', '_id': '576', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 6, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Demeter EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 136, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  5 {'query': 'base salary', '_id': '685', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 7, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Nyx EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  6 {'query': 'base salary', '_id': '794', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 8, 'block_ID': 26, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Apollo EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1969, 'coords_cx': 2, 'coords_cy': 133, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  7 {'query': 'base salary', '_id': '906', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 9, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Rhea EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 134, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  8 {'query': 'base salary', '_id': '1013', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 10, 'block_ID': 24, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Metis EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1825, 'coords_cx': 2, 'coords_cy': 135, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n",
      "query results:  9 {'query': 'base salary', '_id': '1122', 'text': \" Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.\", 'doc_ID': 11, 'block_ID': 26, 'page_num': 3, 'content_type': 'text', 'author_or_speaker': '', 'special_field1': '', 'file_source': 'Athena EXECUTIVE EMPLOYMENT AGREEMENT.pdf', 'added_to_collection': 'Mon Apr 15 21:18:56 2024', 'table': '', 'coords_x': 1093, 'coords_y': -1969, 'coords_cx': 2, 'coords_cy': 133, 'external_files': '', 'score': -8.027992635076878, 'similarity': 0.0, 'distance': 0.0, 'matches': [[13, 'base'], [18, 'salary'], [379, 'base'], [384, 'salary'], [429, 'base'], [434, 'salary']], 'account_name': 'llmware', 'library_name': 'example1_library'}\n"
     ]
    }
   ],
   "source": [
    "# Phrase to look up in the freshly parsed library.\n",
    "test_query = \"base salary\"\n",
    "print(f\"\\nStep 7 - running a test query - {test_query}\\n\")\n",
    "\n",
    "# Run a text query against the library, requesting 10 results.\n",
    "# Each result is a dict (text, doc_ID, page_num, file_source, matches, ...).\n",
    "query_results = Query(library).text_query(test_query, result_count=10)\n",
    "\n",
    "# Print every hit together with its position in the result list.\n",
    "for rank, hit in enumerate(query_results):\n",
    "    print(\"query results: \", rank, hit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "CP2UdFAoiPOf"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "L4",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
